{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>query1</th>\n",
       "      <th>query2</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>咳血</td>\n",
       "      <td>剧烈运动后咯血,是怎么了?</td>\n",
       "      <td>剧烈运动后咯血是什么原因？</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>咳血</td>\n",
       "      <td>剧烈运动后咯血,是怎么了?</td>\n",
       "      <td>剧烈运动后为什么会咯血？</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>咳血</td>\n",
       "      <td>剧烈运动后咯血,是怎么了?</td>\n",
       "      <td>剧烈运动后咯血，应该怎么处理？</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>咳血</td>\n",
       "      <td>剧烈运动后咯血,是怎么了?</td>\n",
       "      <td>剧烈运动后咯血，需要就医吗？</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>咳血</td>\n",
       "      <td>剧烈运动后咯血,是怎么了?</td>\n",
       "      <td>剧烈运动后咯血，是否很严重？</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8748</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>过敏性哮喘吃什么药管用</td>\n",
       "      <td>过敏性哮喘吃什么药有效果？</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8749</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>过敏性哮喘吃什么药管用</td>\n",
       "      <td>过敏性哮喘吃什么药效果好的快？</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8750</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>过敏性哮喘吃什么药管用</td>\n",
       "      <td>过敏性哮喘吃什么水果好？</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8751</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>过敏性哮喘吃什么药管用</td>\n",
       "      <td>过敏性哮喘能治好吗？</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8752</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>过敏性哮喘吃什么药管用</td>\n",
       "      <td>过敏性哮喘吃什么药副作用最小？</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8753 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     category         query1           query2  label\n",
       "0          咳血  剧烈运动后咯血,是怎么了?    剧烈运动后咯血是什么原因？    1.0\n",
       "1          咳血  剧烈运动后咯血,是怎么了?     剧烈运动后为什么会咯血？    1.0\n",
       "2          咳血  剧烈运动后咯血,是怎么了?  剧烈运动后咯血，应该怎么处理？    0.0\n",
       "3          咳血  剧烈运动后咯血,是怎么了?   剧烈运动后咯血，需要就医吗？    0.0\n",
       "4          咳血  剧烈运动后咯血,是怎么了?   剧烈运动后咯血，是否很严重？    0.0\n",
       "...       ...            ...              ...    ...\n",
       "8748       哮喘    过敏性哮喘吃什么药管用    过敏性哮喘吃什么药有效果？    1.0\n",
       "8749       哮喘    过敏性哮喘吃什么药管用  过敏性哮喘吃什么药效果好的快？    1.0\n",
       "8750       哮喘    过敏性哮喘吃什么药管用     过敏性哮喘吃什么水果好？    0.0\n",
       "8751       哮喘    过敏性哮喘吃什么药管用       过敏性哮喘能治好吗？    0.0\n",
       "8752       哮喘    过敏性哮喘吃什么药管用  过敏性哮喘吃什么药副作用最小？    0.0\n",
       "\n",
       "[8753 rows x 4 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "# Load the training pairs from a path relative to the working directory, then preview them.\n",
    "TRAIN_CSV_PATH = \"origin-data/train.csv\"\n",
    "train_df = pd.read_csv(TRAIN_CSV_PATH)\n",
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 8753 entries, 0 to 8752\n",
      "Data columns (total 4 columns):\n",
      " #   Column    Non-Null Count  Dtype  \n",
      "---  ------    --------------  -----  \n",
      " 0   category  8753 non-null   object \n",
      " 1   query1    8753 non-null   object \n",
      " 2   query2    8753 non-null   object \n",
      " 3   label     8747 non-null   float64\n",
      "dtypes: float64(1), object(3)\n",
      "memory usage: 273.7+ KB\n"
     ]
    }
   ],
   "source": [
    "# Print per-column dtypes and non-null counts to spot missing values.\n",
    "# NOTE(review): the saved output shows `label` with fewer non-null entries than rows;\n",
    "# confirm how those unlabeled rows should be handled downstream.\n",
    "train_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8753"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows in the training frame.\n",
    "train_df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0., 1., 0., 0., 0., 0., 0., 0.])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-hot encode the category column. OneHotEncoder expects 2-D input, so a\n",
    "# single-column frame is selected instead of a 1-D Series; `.toarray()` turns the\n",
    "# sparse result into a dense array with one column per distinct category.\n",
    "# OneHotEncoder is imported with the other dependencies in the first code cell.\n",
    "tt = OneHotEncoder().fit_transform(train_df[[\"category\"]]).toarray()\n",
    "# Show the first encoded row as a quick sanity check.\n",
    "tt[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 统计query长度的分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x116261a10>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEGCAYAAAB1iW6ZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAe50lEQVR4nO3deXxU9b3/8ddnJnvIQkgIgQQCiuwgEqG2tsW6XO2itUWr9trN1t62tna57a/319/t4r1drX108/anbX3Uthet9Xah1Vb7s7aiVgQUQUAEIUBYs0BCErJM5vP7YwYMkJABJpnkzPv5eOTBzDnfzHxyOPPON99zzveYuyMiIiNfKNUFiIhIcijQRUQCQoEuIhIQCnQRkYBQoIuIBERGqt64tLTUq6urU/X2IiIj0urVqxvcvayvdSkL9OrqalatWpWqtxcRGZHMbHt/6zTkIiISEAp0EZGAUKCLiASEAl1EJCAU6CIiAaFAFxEJCAW6iEhAKNBFRAJCgS4iEhApu1JUho+lK3acsOyGRRNTUImInAn10EVEAkKBLiISEAp0EZGAUKCLiASEAl1EJCAU6CIiAaFAFxEJCAW6iEhAKNBFRAJCgS4iEhAKdBGRgNBcLnLG+poLBjQfjMhQUw9dRCQgFOgiIgGhQBcRCQgFuohIQCjQRUQCQoEuIhIQCnQRkYBQoIuIBIQCXUQkIBToIiIBoUAXEQkIBbqISEAo0EVEAkKBLiISEAp0EZGASCjQzexyM9tkZlvM7PMnafdOM3Mzq0leiSIikogBA93MwsCdwBXATOB6M5vZR7sC4FZgRbKLFBGRgSXSQ18IbHH3re7eBdwPXNVHu/8Avgl0JLE+ERFJUCKBPgHY2et5XXzZUWZ2HlDl7g+d7IXM7GYzW2Vmq+rr60+5WBER6d8ZHxQ1sxDwHeAzA7V197vdvcbda8rKys70rUVEpJdEbhK9C6jq9bwyvuyIAmA28DczAxgHLDOzK919VbIKlWDr60bTusm0yKlJpIe+EphqZpPNLAu4Dlh2ZKW7N7t7qbtXu3s18AygMBcRGWIDBrq7R4BbgEeAjcAD7r7ezG4zsysHu0AREUlMIkMuuPvDwMPHLftiP20Xn3lZIiJyqnSlqIhIQCjQRUQCQoEuIhIQCnQRkYBQoIuIBIQCXUQkIBToIiIBoUAXEQkIBbqISEAo0EVEAkKBLiISEAp0EZGAUKCLiASEAl1EJCAU6CIiAaFAFxEJCAW6iEhAKNBFRAJCgS4iEhAKdBGRgFCgi4gEhAJdRCQgFOgiIgGhQBcRCQgFuohIQCjQRUQCQoEuIhIQCnQRkYBQoIuIBIQCXUQkIBToIiIBoUAXEQkIBbqISEAo0EVEAkKBLiISEAkFupldbmabzGyLmX2+j/X/YmbrzGyNmT1pZjOTX6qIiJzMgIFuZmHgTuAKYCZwfR+BvdTd57j7ucC3gO8kvVIRETmpRHroC4Et7r7V3buA+4Grejdw95ZeT/MBT16JIiKSiIwE2kwAdvZ6XgcsOr6RmX0M+DSQBbyprxcys5uBmwEmTpx4qrWKiMhJJBLoCXH3O4E7zewG4P8A7+2jzd3A3QA1NTXqxctpWbpixwnLblikDoJIIkMuu4CqXs8r48v6cz/w9jMpSkRETl0igb4SmGpmk80sC7gOWNa7gZlN7fX0LcDm5JUoIiKJGHDIxd0jZnYL8AgQBu5x9/Vmdhuwyt2XAbeY2SVAN3CAPoZbRERkcCU0hu7uDwMPH7fsi70e35rkukRE5BTpSlERkYBQoIuIBIQCXUQkIJJ2HrrISNHXeeygc9ll5FMPXUQkIBToIiIBoUAXEQkIBbqISEAo0EVEAkKBLiISEAp0EZGAUKCLiASEAl1EJCAU6CIiAaFAFxEJCAW6iEhAKNBFRAJCgS4iEhAKdBGRgFCgi4gEhAJdRCQgFOgiIgGhQBcRCQgF
uohIQCjQRUQCQoEuIhIQCnQRkYBQoIuIBIQCXUQkIBToIiIBoUAXEQkIBbqISEAo0EVEAkKBLiISEAp0EZGASCjQzexyM9tkZlvM7PN9rP+0mW0ws7Vm9piZTUp+qSIicjIDBrqZhYE7gSuAmcD1ZjbzuGbPAzXuPhd4EPhWsgsVEZGTS6SHvhDY4u5b3b0LuB+4qncDd3/c3dvjT58BKpNbpoiIDCSRQJ8A7Oz1vC6+rD83AX/qa4WZ3Wxmq8xsVX19feJViojIgDKS+WJm9s9ADfDGvta7+93A3QA1NTWezPcWGQxLV+zoc/kNiyYOcSUiA0sk0HcBVb2eV8aXHcPMLgG+ALzR3TuTU56IiCQqkSGXlcBUM5tsZlnAdcCy3g3MbD5wF3Clu+9PfpkiIjKQAQPd3SPALcAjwEbgAXdfb2a3mdmV8Wa3A6OAX5vZGjNb1s/LiYjIIEloDN3dHwYePm7ZF3s9viTJdYmIyCnSlaJygsbWTm786Qqe33Eg1aWIyClQoMsJVm0/wPLNDVz/42d4dP3ehL5n/6EOOrp7BrkyETkZBbqcYMPuFs6tKmbauEI+/MvV3Pt07Unb72xq5/uPbeb2Rzbx95fr6YpEh6ZQETmGAl2Osf9QB/WtnbzjvAnc96FFXDy9nC8tW8+yF3b32b61M8KvVu2kMCeTiSV5PLJ+L3c8uolX6luHuHIRUaDLMTbubgHgkhnl5GVl8H//+TzmVRXzlWXrOdjedUL7ryxbz4G2Lq6pqeK9r63mQ6+fQkbYeGjtHtx17ZjIUFKgyzE27GlhQnEu44tzAcgIh/jGO+bQfLibrz608Zi2f1q3h1+vruON08qYXJoPwOTSfC6aNpa9LR3845XGIa9fJJ0p0OWo5sPd7DxwmFnjC49ZPqOikA+9YQq/Xl3H01sa6IpEuffpWj734FrmVRZx8fTyY9rPqyomLyvMPU/VDmH1IpLUuVxkZNu4JzbcMqOi8IR1t148lYfX7eGzD64lFIKdTYd5zZQSbl8yj+WbG45pmxkOsWjyGB57aR+1DW1Ux3vvIjK41EOXozbuaaF0VBZjC7JPWJeTGeZrV89hd/NhCrIzufcDC7nvQ6+hqiSvz9daNKWEjJDxswHOkBGR5FEPXQA43NXDK/WtXHh2KWbWZ5vXnV3KM/92MWWjsgmF+m5zRGFOJm+bO54HV9fxmcvOoSAnczDKFpFe1EMXALbUtxL1vodbeisvzBkwzI94/+sm09oZ4b5n+56CFqChtZOv/2kjjyR4AZOI9E89dAFgX0sHBkyIn92SDHMqi3jDOWXc8ejLvPasUmZPKDq67kBbF3c9sZV7n67lcPwK0+ox+UwbV5C09xdJN+qhCxDrKRfnZZIRTu4u8Z1r5zEmP4sP/2I1ja2xafJXb2/i8u89wV1PvMJls8r58ydfz5j8LB5at4eeqM5dFzldCnQBoLG1izGjTjwYeqZKR2Vz1401NLR28rGlz/Gzp7Zx3d3PkJ0R5g+3XMj3rpvP9HGFvGVOBQ2tnTyzVeeui5wuDbkI7k5DayfzJxYPyuvPqSzi6++Yw6cfeIFntjZx0bQyvvuu+RTlvXqgdNq4AqaOHcVjL+1jXlUxo7KDs2v2dRs73cJOBkNwPjVy2tq6euiMRBmTn/we+hHvOK+SA+3d9ESjfPDCKSccWDUz3jKngu//dTP/b8M+3j7/ZPchF5G+KNDl6Nh26aisQX2fmy6cfNL1YwtzWDi5hGe3NXHJzPKTthWRE2kMXWhojU26NRhj6Kdq0eQxRB3W7DyY6lJERhwFutDQ2knIYHTe4PbQE1FemEPl6Fye235AszWKnCIFutDY2snovCzCCV4wNNgWTBrN3pYO1sen8hWRxCjQhca2LkqHwXDLEXMnFJMRMh5cXZfqUkRGFAV6mnP3+DnoqR9uOSI3K8yMikJ+t2YXnRHdp1QkUQr0NLf/UCddPdFh1UOH2LDLwfZu/rpxf6pLERkxFOhpbmt9
G8Cw6qEDnD12FOWF2fxawy4iCVOgp7naxliglw7iRUWnI2TGkgWVPL5pP4+/pF66SCJ0YVGaq21oIxyyYy7DHy4+uvhsHn+pno/f9zwPfuQCpo87+dS+QXCm0wT09f2n+hoycqmHnua2NbQxJj+LUD83tUil/OwMfvq+GvKzw9z0s1XUH+okGnX2NB9m/e5m2jojqS5RZFhRDz3NbWtoGxZXiPanoiiXn7znfK696x9c+cMnaeuM0NIRC/KQQXVpPvMmFPOu86uGzXn0IqmiQE9j0aizvamdRdUlqS7lpOZUFvH96+fzvcdeZvG0Mrp7nNF5WWzed4gXd7fw2zW7mFyWzycunprqUkVSSoGexnY3H6YrEh3WPfQjLp1ZzqXxCbuOjBNPLs3n0pnlPLBqJ997bDNvPKeMeVWDMwWwyEigMfQ0VtvQDgz+LIuDycy4ct4ExhZk86kH1nC4SxciSfpSoKexbY1HzkEf/j30k8nNCvPta+axtb6Nr/9p40nb1ja0sXTFdlbWNg1RdSJDR0MuaWx7Qxs5mSEKc0b+bvC6s0v5wOsmc89T25gzoYhraqqOWd8Tde55cht3/GUTnd1RXtzdQmtnhMXnlGHD8AwfkdMx8j/JctpqG9uZVJIfmED73OXTeGlvC599cC2HOiJ8IH5DjdXbm7jtDxt4oa6ZS2aUc97EYh7dsI+/bNjH4a4erpg9LjDbQNKbAj2N7Whqo3pMfqrLSJqczDD3vO98PnHf89z2xw3saT7MzqbD/Hn9XsYWZPP96+fztrkV3PfsTpYsqCQ3M8yTWxrIDIeOHnAVGckSGkM3s8vNbJOZbTGzz/ex/g1m9pyZRcxsSfLLlGSLRp3tje1MGpOX6lKSKiczzH+9+zyWLKjkx8u3sXxzPZ++9Bz+9tnFXDlv/NGeeMiMt86tYG5lEU9uqadVFylJAAzYQzezMHAncClQB6w0s2XuvqFXsx3A+4B/HYwiJfn2HeqgMxJlUoB66EdkhEN8651zuWxmOfMnjqasoO+DvmbGJdPLWVfXzPLN9dz8hilDXKlIciXSQ18IbHH3re7eBdwPXNW7gbvXuvtaIDoINcog2N4YO2UxaD30I0Ih47JZ4/oN8yNKC7KZV1XMM1sbaYjfLFtkpEok0CcAO3s9r4svO2VmdrOZrTKzVfX19afzEpIk2+OnLAZpDP10LZ5WRqTH+fHyrakuReSMDOl56O5+t7vXuHtNWVnZUL61HGd7YzuZYaOiKCfVpaTc2IIc5lYW8Yt/bKdRvXQZwRIJ9F1A75N6K+PLZATb3thO5eg8MsK6tgzgouljOdzdw52Pv5LqUkROWyKf5pXAVDObbGZZwHXAssEtSwZbbWNbYMfPT8fYghyuXziRe57axmMb96W6HJHTMmCgu3sEuAV4BNgIPODu683sNjO7EsDMzjezOuAa4C4zWz+YRcuZcXd2NLZr/Pw4X3zrTGZPKOSTv1pDbUNbqssROWUJ/b3t7g+7+znufpa7fzW+7Ivuviz+eKW7V7p7vruPcfdZg1m0nJmmti4OdUaYWKIeem85mWF+9O4FhEPGv/xytSb6khFHV4qmodr4KYvVpQr041WV5PHdd53L+3+2krf+YDm5WRmMzs2ktCCbSSV5lBVka5oAGbYU6GloR1NsOCGIFxUlw+JpY7l9yTx+9/wuXtrbwkt7WohEHYC8rDCzJxRxbU2lDijLsKNAT0O1De2YQeXo3FSXMmwtWVDJkgWVLF2xA3ensbWL2sY2ttS38uy2Jv799+v52tWz1VuXYUWBnoZ2NLUzviiX7IxwqksZEcyM0oJsSguyqakuoTh3L/c9u4Oqklw+uvjsVJcncpQCPQ3plMUzc9mscorzMvnWnzcxoTiXq849rQunRZJOg4BpKDbLosbPT1fIjNuvmcuiySV89tdrWb1ddz+S4UGBnmZaOrppauuiWj30M5KdEeauGxdQUZzDh3/xHLsPHk51SSIK9HSzI+CzLA6l
4rwsfvKeGjq6e7j5F6t03rqknAI9zbw6ba6GXJJhankB37vuXNbvbuHW+59ny/7WY9a7O909mlVahoYOiqaZ2sYj56Crh54sF88o5wtvnsF/PrSRRzfs46yyfM6vLmFHUzsv7T1EU1sXk0vzObeqmNnji8jN0tlFMjjUQ08zG/e0MKE4l7ws/S5Ppg++fgr/+Lc3cdtVsxhXlMPD6/bQ1tXDpTPKecPUMg51dPPb53fx9T9tZM3OA6kuVwJKn+o0s7aumXlVRakuI5AqinJ5zwXVvOeC6mOWL12xg3+aVc6ug4f504t7eWBVHTMqCnn/6yanplAJLPXQ08jB9i52NLUzt7I41aWkHTOjcnQe73ttNbPGF/KVP2zgjkc34e6pLk0CRD30NLK2rhmAuRPUQ0+VzHCI6xdO5MVdzfzgr1tYsbWJf3vzdOZPHH20TdSdA21d7G3poP5QJ6PzMrlkZjmZmjtGBqBATyPrdsUCfXalAj2VQmZ8/R1zmFtZzHf+8jJX/9fT/NOscgpzMvnH1kb2tXTQ3fNqz/3RDfsoHZXNkgWVvHvRRKr6mfbY3dnZ1E5rZ4SzykaRlaFfAOlGgZ5GXth5kCml+RTmZKa6lLRnZtywaCJXnTueu5/Yyk+WbyU3K0xRbibnV5cwrjCH8sIcygqymVyaz9Jnd/Dj5bF2V8+fwMcuOpvq0nx6os6+lg4ee2k/S1fsYOOeFgAyw8a0cYWcXz2aqWMLUvzTylBRoKeRdbuaWTS5JNVlSC/52Rl86tJzuPXiqYRCxtIVO05oc9H0sVw0fSx7mg9z9xNbWbpiB//zXB0VRbnsbemgJz6178yKQq46dzwl+Vms390S+9rVzIdeP2WofyxJEQV6mth/qIM9zR3M0QHRYSkUGnga3oqiXL70tll8ZPFZ/OypWvY2dzC+OJfxxbnMnlDInAlF3PfsTgCmji3g8lnj+MFfN/Pr1Tv5+MVnU6C/zAJPgZ4m1h05IKrx8xFvbEEOn7t8+oDtcjLDXFtTxd1PbOVLy9bznWvPHYLqJJV01CRNvFDXTMhg1vjCVJciQ2jSmHwWTxvLb57bxR/X7k51OTLIFOhpYl3dQaaOLdAVomnoTdPHMq+qmC/89kWa2rpSXY4MIgV6GnB31tY1a7glTYVDxu1L5nKoo5vvP7Y51eXIIFKgp4HdzR00tnUp0NPYOeUFXLdwIr98Zjtb61sH/gYZkRToaWDtzoMAuuQ/zX3qknPIzgjxzT+/lOpSZJAo0NPAmp0HyQwb0yt0gUk6KyvI5iOLz+KR9ftYsbUx1eXIIFCgB1x3T5TfrdnFBWeVkp2hebjT3U0XTmFcYQ5ffXgj0agmBgsaBXrAPbp+H/taOnnvBZNSXYoMA7lZYT5/xXTW1jXzo7+/kupyJMkU6AF379O1VJXksnja2FSXIsPEVeeO523zxnPHo5t4dltTqsuRJNJJyQG2YXcLz9Y28YU3zyCcwKXlkh7MjK9dPZt1dQf5xH3P8/Ctr6ckPwugz7lkAG5YNHEoS5TTpB56gP38H7XkZIa4pqYy1aXIMFOQk8kPbziPprYuPvPAGiK6kXUgKNAD6mB7F79bs4ur50+gOC8r1eXIMDR7QhH//tYZPL6pnht+vILdBw+nuiQ5Qwr0gPrvFTvo6I5y42uqU12KDGM3XlDNHdfM48Xdzbz5+8vZsLsl1SXJGdAY+jDV11hmouOYP31yG99+dBNvmj6WmZqMSwbwzgWVzJ9YzMfve55frtjOpDF5XDBlDLPGF+nYywijQA+QaNT5z4c2cs9T25hZUcgbzyk74ReDDm5JX6aUjeI3H30tn/rVCzyztZH7V+6kMGcP86qKmVtZjLtjpnAf7hTow1hXJMrh7h66IlG6eqJs2N1CblaY3MwwIQMHeqLO5v2trNjayBOb63lxVwvve201Z48dRUgfQDkF2RlhLjy7lNeeNYZNew/x7LYmntrSwPLNDTy0djeLp43lNVNKWDh5zNGz
YmR4SSjQzexy4HtAGPiJu3/juPXZwM+BBUAj8C53r01uqcHT3hWhsbWL/Yc62dHURm1DO7WNbexoauflvYdo6+o5pv2dj2/p97UyQsacyiK+/o45XL9wYr+nn4kMJGTGjIpCZlQU0t4ZYf2eFhrbuvjVyp387OlaAMoLs8nJCFOSn8Wo7Axy4h2NnMwQl84cx6jsDHKzwmSFQ2RlhAiHjJ5olEjUifQ4nZEoXZEof9mwDwCz2FdmKMRb51WQFQ4RChkhMwyIRKP8fs1uolFwYle4GsaSmkqKczMpzM3U8BAJBLqZhYE7gUuBOmClmS1z9w29mt0EHHD3s83sOuCbwLsGo+CBuDuRqNMTdbp7onT3xP7tikSPPu+JOlGPffVmGL07te6xnccdetxxd3qiHP3+SNTpjkSJRKN0RqK0d/XEvjojHOqMcKgjQmtnhNaObto6e6g70H50R+6I9BxzZ3eI7dDji3KZNCaPmeMLKcnLIjcrg6yMEFlh43Vnl9IR6eFwV5Soe2xnN6gcnct5E0eTn60/uCS58rIzOL+6hBsWTaQrEmXdroOs2NbEtvo2Vm0/wLaGNtq6Isfsyw+sqjuj9zyVK1jv/Fusk2MGpaOyY7fkK8qhoiiXiqIcxhXlMCY/i8LcTIrzMsnNDJOVESIzHPsl0/sz3t0TjefGkeyI5UXUY5nhHnufkBkhg4xwiMywkRkOxb9ijzNCRjhkKRmiSiQBFgJb3H0rgJndD1wF9A70q4Avxx8/CPzQzMzdkz5ZxE+f3Ma3H9kEvPof4c7RgB4u01PkZIYYlZ1JQU4G+dlh8rMyKMzNJCsjRHZGiOyMMPnZGVw8Yyylo7KYWJJHVUne0flW+uphXzGnYqh/DJGjsjJCLJhUwoJJsRuN995HIz2x4cGO7igXTS+jtSNCRyQ2XNgZiXVAMkKxEM0IWayTkhHi8ZfqgVc/y5Ee54KzxtDdEz36mXaPheczrzTGgjL+ng7Mn1hM8+FuDrR3s6+5g93Nh9m07xB/f7me9uP+wh1qoaPhb0f/AjlS/ZfeNpPrFib/eJYNlLlmtgS43N0/GH9+I7DI3W/p1ebFeJu6+PNX4m0ajnutm4Gb40+nAZuS9HOUAg0DtkpP2jZ903bpn7ZN/4bDtpnk7mV9rRjSv9Hd/W7g7mS/rpmtcveaZL9uEGjb9E3bpX/aNv0b7tsmkQuLdgFVvZ5Xxpf12cbMMoAiYgdHRURkiCQS6CuBqWY22cyygOuAZce1WQa8N/54CfDXwRg/FxGR/g045OLuETO7BXiE2GmL97j7ejO7DVjl7suAnwK/MLMtQBOx0B9KSR/GCRBtm75pu/RP26Z/w3rbDHhQVERERgZNziUiEhAKdBGRgBjxgW5mtWa2zszWmNmqVNeTSmZ2j5ntj18XcGRZiZn9xcw2x/8dncoaU6Gf7fJlM9sV32/WmNmbU1ljqphZlZk9bmYbzGy9md0aX57W+81Jtsuw3m9G/Bi6mdUCNcdfxJSOzOwNQCvwc3efHV/2LaDJ3b9hZp8HRrv7/0plnUOtn+3yZaDV3b+dytpSzcwqgAp3f87MCoDVwNuB95HG+81Jtsu1DOP9ZsT30OVV7v4EsbOMersKuDf++F5iO2Va6We7CODue9z9ufjjQ8BGYAJpvt+cZLsMa0EIdAceNbPV8akF5Fjl7r4n/ngvUJ7KYoaZW8xsbXxIJq2GFPpiZtXAfGAF2m+OOm67wDDeb4IQ6Be6+3nAFcDH4n9eSx/iF3uN7DG25PkRcBZwLrAHuCO15aSWmY0C/gf4pLsfcx+6dN5v+tguw3q/GfGB7u674v/uB35LbHZIedW++HjgkXHB/SmuZ1hw933u3uPuUeDHpPF+Y2aZxELrv939N/HFab/f9LVdhvt+M6ID3czy4wcsMLN84DLgxZN/V9rpPS3De4Hfp7CWYeNIWMVdTZruNxabtPunwEZ3/06vVWm93/S3XYb7fjOiz3IxsynEeuUQm8Zg
qbt/NYUlpZSZ3QcsJjbF5z7gS8DvgAeAicB24Fp3T6sDhP1sl8XE/mx2oBb4cK8x47RhZhcCy4F1QDS++H8TGy9O2/3mJNvleobxfjOiA11ERF41oodcRETkVQp0EZGAUKCLiASEAl1EJCAU6CIiAaFAFzlDZvZnMztoZn9MdS2S3hToIgOI3/j8ZG4HbhyKWkRORoEugWNmXzCzl83sSTO7z8z+1cz+ZmY18fWl8WmXMbOwmd1uZivjEy59OL58sZktN7NlwAYzu83MPtnrPb56ZI5sd38MODTkP6jIcQa8SbTISGJmC4jdpPxcYvv3c8Tmsu7PTUCzu59vZtnAU2b2aHzdecBsd98Wn3HvN8B3zSwUf49hNY+HiAJdgub1wG/dvR0g3sM+mcuAuWa2JP68CJgKdAHPuvs2AHevNbNGM5tPbCrZ5929cVB+ApHTpECXdBHh1SHGnF7LDfi4uz/Su7GZLQbajnuNnxC7k8844J5BqVLkDGgMXYLmCeDtZpYbn4nzbfHltcCC+OMlvdo/AnwkPlUqZnZOfObOvvwWuBw4P/59IsOKeugSKPF7QP4KeIHYHN4r46u+DTwQv6vVQ72+5SdANfBcfMrUevq53Zq7d5nZ48BBd+85stzMlgPTgVFmVgfcdHyPX2QoaLZFCbRk3gw6fjD0OeAad998pq8nkmwachFJgJnNBLYAjynMZbhSD11EJCDUQxcRCQgFuohIQCjQRUQCQoEuIhIQCnQRkYD4/1zf2vtlNtBRAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Length (in characters) of every query1 string, drawn as a histogram with a KDE overlay.\n",
    "query1_lengths = train_df[\"query1\"].map(len)\n",
    "sns.distplot(query1_lengths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x118432e10>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEGCAYAAABsLkJ6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de3hc9X3n8fd3RhdLti7WxbItyTcsg21sTLANCYGQkItJUky7QCA0Id1sSXZLN91uuyHbp2lLw/M03XZp+zRNQ8k9IcBCIE5w4pBwS0ggNka+YyFfJdmWZEuWZeuu+e4fc0QGIVkj63JGms/reeaZM7/z+535niP7fOf8zjm/Y+6OiIikn0jYAYiISDiUAERE0pQSgIhImlICEBFJU0oAIiJpKiPsAEajpKTEFy1aFHYYIiJTyiuvvHLS3UsHl0+pBLBo0SK2bdsWdhgiIlOKmR0ZqlxdQCIiaUoJQEQkTSkBiIikKSUAEZE0pQQgIpKmlABERNKUEoCISJpSAhARSVNKACIiaWpK3Qks4+uhl48mXfejVy6YwEhEJAw6AhARSVNKACIiaUoJQEQkTSkBiIikKSUAEZE0pQQgIpKmlABERNKUEoCISJpKKgGY2QYz229mtWZ2zxDzrzWz7WbWZ2Y3J5S/28yqE15dZnZTMO8bZnYoYd6a8VstEREZyYh3AptZFPgS8D6gHthqZpvcfW9CtaPAJ4A/S2zr7s8Ca4LlFAG1wE8Tqvy5uz82lhUQEZELk8xQEOuBWnc/CGBmDwMbgTcSgLsfDubFzrOcm4Efu3vHBUcrIiLjJpkuoHKgLuFzfVA2WrcB3xtUdp+Z7TSz+80se6hGZnaXmW0zs23Nzc0X8LUiIjKUSTkJbGbzgFXAloTizwGXAOuAIuCzQ7V19wfcfa27ry0tLZ3wWEVE0kUyCaABqEz4XBGUjcatwBPu3jtQ4O7HPa4b+DrxriYREZkkySSArUCVmS02syziXTmbRvk9tzOo+yc4KsDMDLgJ2D3KZYqIyBiMmADcvQ+4m3j3zT7gUXffY2b3mtmNAGa2zszqgVuAr5jZnoH2ZraI+BHE84MW/V0z2wXsAkqAL4x9dUREJFlJPRDG3TcDmweVfT5heivxrqGh2h5miJPG7v6e0QQqIiLjS3cCi4ikKSUAEZE0pQQgIpKmlABERNKUEoCISJpSAhARSVNKACIiaUoJQEQkTSkBiIikKSUAEZE0ldRQECIPvXz0LWUfvXJBCJGIyHjREYCISJpSAhARSVNKACIiaUoJQEQkTSkBiIikKSUAEZE0pQQgIpKmdB/ANKPr9UUkWUkdAZjZBjPbb2a1ZnbPEPOvNbPtZtZnZjcPmtdvZtXBa1NC+WIzezlY5iNmljX21RERkWSNmADMLAp8CbgBWAHcbmYrBlU7CnwCeGiIRXS6+5rgdWNC+ReB+919KdAKfPIC4hcRkQuUzBHAeqDW3Q+6ew/wMLAxsYK7H3b3nUAsmS81MwPeAzwWFH0TuCnpqEVEZMySSQDlQF3C5/qgLFkzzGybmb1kZgM7+WLgtLv3jbRMM7sraL+tubl5FF8rIiLnMxkngRe6e4OZLQGeMbNdQFuyjd39AeABgLVr1/oExSgiknaSOQJoACoTPlcEZUlx94bg/SDwHHA5cAooNLOBBDSqZYqIyNglkwC2AlXBVTtZwG3AphHaAGBms80sO5guAa4G9rq7A88CA1cM3Qn8YLTBi4jIhRsxAQT99HcDW4B9wKPuvsfM7jWzGwHMbJ2Z1QO3AF8xsz1B8+XANjPbQXyH/3fuvjeY91ngT82slvg5ga+O54qJiMj5JXUOwN03A5sHlX0+YXor8W6cwe1+BawaZpkHiV9hJCIiIdBQECIiaUoJQEQkTSkBiIikKSUAEZE0pQQgIpKmlABERNKUEoCISJpSAhARSVNKACIiaUoJQEQkTemZwDLu
hnouMejZxCKpRkcAIiJpSglARCRNKQGIiKQpJQARkTSlBCAikqZ0FVCa23f8DK0dPUTMiJoxJz+bitm5RCMWdmgiMsGUANLYs/ubeHpv41vKczKjLJ0zi+suLmVeQU4IkYnIZFACSFP//vwBnt7byOWVhXxw1Txi7vTHnLrWTmpOtLPneBuNZ7r4zPVVmOloQGQ6SuocgJltMLP9ZlZrZvcMMf9aM9tuZn1mdnNC+Roz+7WZ7TGznWb2kYR53zCzQ2ZWHbzWjM8qyUge/MVB/u7Hr7G6ooD/dEUFM7MzyJuRSWFuFqvK42UfvHQeTe3dHDx5LuxwRWSCjJgAzCwKfAm4AVgB3G5mKwZVOwp8AnhoUHkH8HF3XwlsAP7JzAoT5v+5u68JXtUXuA4yCnUtHdy3eR8bVs7llisqiQzz6/6yykJys6K8dPDUJEcoIpMlmSOA9UCtux909x7gYWBjYgV3P+zuO4HYoPIad389mD4GNAGl4xK5XJAte07gDv/7g8vPe6I3Mxph7cIi9h47w+mOnkmMUEQmSzIJoByoS/hcH5SNipmtB7KAAwnF9wVdQ/ebWfYw7e4ys21mtq25uXm0XyuD/HRPI5fMzWNBce6Ida9cUgTAy4daJjosEQnBpNwHYGbzgG8Df+DuA0cJnwMuAdYBRcBnh2rr7g+4+1p3X1taqoOHsTh5tputR1r4wMq5SdWfnZvF8nn5bD3cQm9/bOQGIjKlJJMAGoDKhM8VQVlSzCwfeAr4C3d/aaDc3Y97XDfwdeJdTTKBfra3EXd4/8qypNu8/aJiOnr62Vl/egIjE5EwJJMAtgJVZrbYzLKA24BNySw8qP8E8C13f2zQvHnBuwE3AbtHE7iM3pY9J6iYncOKeflJt1lSMpOy/GxerD2Fu09gdCIy2UZMAO7eB9wNbAH2AY+6+x4zu9fMbgQws3VmVg/cAnzFzPYEzW8FrgU+McTlnt81s13ALqAE+MK4rpm8SXtXLy/WnuIDK+eO6rp+M+OapaWcONNFTWP7BEYoIpMtqRvB3H0zsHlQ2ecTprcS7xoa3O47wHeGWeZ7RhWpjMnzNc309MeS7v9PdFllIU/va+T5mmYunpv80YOIpDYNBpcmtuxppHhmFlcsnD3qttGIcU1VCYdPdXDklG4ME5kulADSQHdfP8++1sT7VpRd8CBvaxcWkZsV5fkaXYorMl0oAaSBX75+krPdfRfU/TMgKyPCOy4q5rUT7Zxo6xrH6EQkLEoAaeBHO49TkJPJ1UtLxrScq5YUkxWN8MLrOgoQmQ6UAKa53v4YT+9tZMPKuWRljO3PnZuVwbpFs9lZf5q2zt5xilBEwqIEMM3VNLZztruPD182b1yW9/aLSnCHlw9pkDiRqU4JYJrbWd9G0cws3r6keFyWVzQzi0vm5fObQy109faPyzJFJBxKANNYT1+M106cYcOlc8mIjt+f+h3B8BCbqo+N2zJFZPIpAUxj+xvb6e13Prx6fLp/Biwpmcnc/Bl87cVDGh5CZApTApjGdtafJi87gysXj0/3zwAze+OS0F/rgTEiU5YSwDTV3dvP/hPtrCwvuOCbv87nsspCZudm8vUXD4/7skVkcigBTFMHT56jL+asnD8xY/dkRiPcuq6SZ15r4uTZ7gn5DhGZWEoA09Shk+fIiBgLikZ+8teF+t3Ly+mPOU/tPD5h3yEiEyep0UBl6jl08hyVRblkjuPVP4NdMjefS+bm8WR1A3e+Y9GI9R96+ehbyj565YIJiExEkqEEMEUNtTMd0NXbz7HTnbz7kjkTHsfGNeV88SevceTUORYWz5zw7xOR8aMuoGnoyKkOHFg0CTvkG9fMB+AHuidAZMpRApiGDp08R9Qmtv9/QHlhDusXF/FkdYPuCRCZYpQApqFDJ89SPjtnzIO/JeumNeUcbD7H7oYzk/J9IjI+lACmmZ6+GA2nO1lcMnn98R9cNZfMqPFkdcOkfaeIjF1SCcDMNpjZfjOrNbN7
hph/rZltN7M+M7t50Lw7zez14HVnQvkVZrYrWOa/2GieVC7DOtrSQcyZ1ARQmJvFdRfP4QfVx+ju0wBxIlPFiAnAzKLAl4AbgBXA7Wa2YlC1o8AngIcGtS0C/gq4ElgP/JWZDTyU9svAHwJVwWvDBa+FvOHQybNEDBZOQv9/oo9dtZCTZ7t5/BUdBYhMFckcAawHat39oLv3AA8DGxMruPthd98JxAa1/QDwtLu3uHsr8DSwwczmAfnu/pLHzxx+C7hprCsjcOhkB/MLc8jOjE7q915TVcJlFQV8+fla+mM6GSwyFSSTAMqBuoTP9UFZMoZrWx5Mj7hMM7vLzLaZ2bbmZj2K8Hx6+2PUt3ZMyuWfg5kZd7+nirqWTnbWn5707xeR0Uv5k8Du/oC7r3X3taWlpWGHk9LqWzvpi/mk9v8nuv6SOVwyN4/napqJ6ZJQkZSXTAJoACoTPlcEZckYrm1DMH0hy5RhHG3pACa//39AJGL80buX0tzezd5juiRUJNUlkwC2AlVmttjMsoDbgE1JLn8L8H4zmx2c/H0/sMXdjwNnzOyq4OqfjwM/uID4JUFdSwfFM7PIzQ5vhI8PrppHyawsnt3fpBvDRFLciAnA3fuAu4nvzPcBj7r7HjO718xuBDCzdWZWD9wCfMXM9gRtW4C/JZ5EtgL3BmUA/w14EKgFDgA/Htc1SzPuTl1LB5Uh/fofEI0Y71xayvG2LhpOd4Yai4icX1I/Fd19M7B5UNnnE6a38uYuncR6XwO+NkT5NuDS0QQrw2vr7KW9uy/0BACwqryAH+44xs76Nipmhx+PiAwt5U8CS3LqWuO/titn54QcCeRkRVlWNoud9ad1MlgkhSkBTBN1LR1kRIy5BTPCDgWA1ZWFnOnq4/Cpc2GHIiLDUAKYJupa4jeAZURS40+6fG4+mVFjR11b2KGIyDBSY28hY9IXiw8ANxnDPycrKyPC8nn57G5ooy82+AZxEUkFSgDTwIm2LvpiTkUK9P8nuqyikM7efmqbzoYdiogMQQlgGhg4AZxKRwAAVWWzyMmMsrNe3UAiqUgJYBqoa+kgLzuDgpzMsEN5k4xIhEvL89l77Aw9feoGEkk1SgDTwMANYKn4SIXLKgrp6Y+x97iGhhBJNUoAU1xHdx+nzvWkxPX/Q1lUMpPCnEyq61rDDkVEBlECmOLqWuMDwKXCHcBDiZixprKQ1xvP0t7VG3Y4IpJACWCKO9B8jmjEUnrIhTULCnFgR52eEyCSSpQApriaxnYWF88kKyN1/5Rz8mZQXpjDq0oAIikldfcaMqLTHT00tXdTVTYr7FBGdPmCQo63dXHiTFfYoYhIQAlgChu4waqqLC/kSEa2uqKQiEH1UZ0MFkkVSgBTWE1jO/kzMijLyw47lBHNys5gWVke1XUaIVQkVSgBTFH9Mae2+SzLyvJS8vr/oVy+YDZnuvo0NIRIigjv2YEyJvWtHXT1xqZE98+A5XPzmJkV5TeHWlg2QtwPvXz0LWUfvXLBRIUmkpaUAFLIaHZ6NY1nMWBpaeqfAB6QEY1wxcLZ/LL2JG2dvSk3dIVIukmqC8jMNpjZfjOrNbN7hpifbWaPBPNfNrNFQfkdZlad8IqZ2Zpg3nPBMgfmzRnPFZvuXm9qp7Iol5ysaNihjMq6RUXEHLYdaRm5sohMqBETgJlFgS8BNwArgNvNbMWgap8EWt19KXA/8EUAd/+uu69x9zXAx4BD7l6d0O6Ogfnu3jQO65MWWs710NDaybIpcPnnYMWzsqmaM4tth1vpj+lksEiYkjkCWA/UuvtBd+8BHgY2DqqzEfhmMP0YcL299czk7UFbGaNfvN6MA1Vzpk7/f6J1i4po6+ylprE97FBE0loyCaAcqEv4XB+UDVnH3fuANqB4UJ2PAN8bVPb1oPvnL4dIGACY2V1mts3MtjU3NycR7vT3/P5mcrOilKfoAHAjWT4vn7wZGfzmkLqBRMI0KZeBmtmVQIe7704ovsPdVwHX
BK+PDdXW3R9w97Xuvra0tHQSok1tsZjzXE0zy8ryiEyRyz8Hi0aMtQtnU9PYTsPpzrDDEUlbySSABqAy4XNFUDZkHTPLAAqAUwnzb2PQr393bwje24GHiHc1yQh2NrTRcq6Hi6fQ5Z9DuWJhEQ78aMexsEMRSVvJJICtQJWZLTazLOI7802D6mwC7gymbwaecY/f7mlmEeBWEvr/zSzDzEqC6Uzgw8BuZETPvtZExKBqztQ7AZyoaGYW5YU5bN59IuxQRNLWiAkg6NO/G9gC7AMedfc9Znavmd0YVPsqUGxmtcCfAomXil4L1Ln7wYSybGCLme0EqokfQfzHmNcmDTy3v4k1lYXkZk/9WzgunZ/PjrrT1AfPNBCRyZXUXsTdNwObB5V9PmG6C7hlmLbPAVcNKjsHXDHKWNNec3s3O+rb+J/vWxZ2KOPi0vICtuxt5Ce7T/BfrlkSdjgiaUdjAU0hL9TEr4J69yXT45654lnZrJiXz+Zdx8MORSQtKQFMIc/ub6I0L77TnC4+uGou24+e5nibrgYSmWxKAFNEX3+MF2qauW5ZKZHI1Lz8cyg3rJoHwE90Mlhk0ikBTBGv1p3mTFfftOn+GXBR6SwuLstTN5BICJQApoif7W0kI2JcvbQk7FDG3QdXzWPbkVYa9bhIkUmlBDAFxGLOD3cc45qqkmk5hPKHVs/FHZ54dfD9hSIykZQApoCth1s41tbFTZcPHoJpelg6J4/1i4r47stHNEKoyCRSApgCnqw+Rk5mlPcuLws7lAnzsbcvpK6lk+drNCq4yGRRAkhxPX0xNu86zvtXljFzGtz9O5wPrJxLaV423/71kbBDEUkbSgAp7vmaZto6e7lpzfTs/hmQlRHh9vULeK6mmaOnNDSEyGRQAkhxT1Y3UDQzi3dWTb+rfwb76PoFRMz4zss6ChCZDNO3T2Ea6Ort52d7G7l1bSWZ0emfq+cWzOADK8t4dFsd/+O9y96yzg+9fHTIdh+9csFkhCcy7Uz/vcoUtvf4Gbr7Ytx0+fywQ5k0v3/VQk539LKj7nTYoYhMe0oAKWxn/WkqZufwtgWzww5l0rx9STGXzM3jVwdOETxSQkQmiBJAiuro6aO26SwfWj2PYR6XPC2ZGf/56sWcONPFwZPnwg5HZFpTAkhR+46fIebwoWCwtHRy45r55GZF+VXtybBDEZnWlABS1K6GNmbnZrKqvCDsUCbdjMwoVy4u5rUT7Zw62x12OCLTlhJAChro/llVXpBW3T+JrlxSRMSMXx04FXYoItOWEkAK2nss3v2zqrww7FBCkz8jk9UVBbxypJXOnv6wwxGZlpJKAGa2wcz2m1mtmd0zxPxsM3skmP+ymS0KyheZWaeZVQevf09oc4WZ7Qra/Iul60/dIew+Fu/+mV84I+xQQvWOpSX09MfYdqQl7FBEpqURE4CZRYEvATcAK4DbzWzFoGqfBFrdfSlwP/DFhHkH3H1N8Pp0QvmXgT8EqoLXhgtfjenjt90/hWnb/TOgvDCHJaUzebH2JH39sbDDEZl2kjkCWA/UuvtBd+8BHgY2DqqzEfhmMP0YcP35ftGb2Twg391f8vjF3t8Cbhp19NPQb7t/0u/k71CuWzaHM119vHpUN4aJjLdkEkA5UJfwuT4oG7KOu/cBbUBxMG+xmb1qZs+b2TUJ9etHWCYAZnaXmW0zs23Nzc1JhDu17Tl2Rt0/CS4qnUl5YQ7Pv96sZwWIjLOJPgl8HFjg7pcDfwo8ZGb5o1mAuz/g7mvdfW1paemEBJkq+vpjHDx5lovn5qd9988AM+Ndy0ppOdfD7mNtYYcjMq0kkwAagMqEzxVB2ZB1zCwDKABOuXu3u58CcPdXgAPAsqB+xQjLTDuHT3XQ2+8sK5sVdigpZcX8fEpnZfNCTbOGhxAZR8kkgK1AlZktNrMs4DZg06A6m4A7g+mbgWfc3c2sNDiJjJktIX6y96C7HwfOmNlVwbmCjwM/GIf1mdJeb2wnGjGW
lCgBJIqYce2yUo63dbG/sT3scESmjRETQNCnfzewBdgHPOrue8zsXjO7Maj2VaDYzGqJd/UMXCp6LbDTzKqJnxz+tLsPXNP334AHgVriRwY/Hqd1mrJqmtpZVJxLVoZuzxhsTWUhs3Mz+fGuE7oiSGScJPU8AHffDGweVPb5hOku4JYh2j0OPD7MMrcBl44m2OnsRFsXjWe6edul6TPy52hEI8bGNeV841eHeb6mmetHeD6ynh0gMjL91EwRL9TEr3CqmpMXciSpa1lZHqsrCniuppmm9q6wwxGZ8pQAUsTzNc3kz8igLD877FBS2odWzSMzajz56jFiOiEsMiZ6JGQIBndP9MecZ15rYsV8Xf45krwZmdxw6TyeeLWB7UdaWbuoKOyQRKYsHQGkgIbWDjp7+6mao6t/knHFwtksLM7lJ3tOaKA4kTFQAkgBNU1nMWCpEkBSImb8zur5dPb088xrjWGHIzJlKQGkgP0n2qksyiU3Sz1yyZpfmMPaRUX8+uApapt0b4DIhVACCFnLuR4aTneycv6oRsgQ4H0rysjKiPA3P9yrO4RFLoASQMh2N8THt7l0vkb/HK1Z2Rlcf0kZv3j9JD/f1xR2OCJTjhJAyHYfa6O8MIfZM7PCDmVKumpJMVVzZvFXm/bQ3tUbdjgiU4oSQIhaz/VQ39qpsf/HIBox/v7m1Rxv6+S+p/aFHY7IlKIEEKKB4Y0vVQIYk8sXzOZT77qIh7fW8ex+dQWJJEsJIES7GuLdP0Xq/hmzP3lvFcvKZnHP4ztp61BXkEgylABC0toR7/7Rr//xkZ0R5R9vWcPJsz3c+6O9YYcjMiUoAYRkzxtX/+jyz/GyqqKAT79rCY9vr+fQyXNhhyOS8pQAQrKroY35BTMonqXB38bT3e+uorwwhx/uOKZnCIuMQAkgBC3neqhr7WR1RWHYoUw7OVlR/vLDyzlxpouXDp4KOxyRlKYEEIId9acBWF2h/v+J8IGVc6maM4uf7WvUvQEi56EEMMncneq60ywqzqUwV1f/TAQLBovr63d+svtE2OGIpCwlgEm29/gZmtu7uaxS3T8TqSQvm3csLaa67rSeHiYyjKQSgJltMLP9ZlZrZvcMMT/bzB4J5r9sZouC8veZ2Stmtit4f09Cm+eCZVYHrznjtVKpbFP1MSKmsX8mwzVVpWREjef2N4cdikhKGjEBmFkU+BJwA7ACuN3MVgyq9kmg1d2XAvcDXwzKTwK/4+6rgDuBbw9qd4e7rwle0/4WzljM2bTjGFVz8piZraGfJ9qs7AyuXFzMjrrTnDzbHXY4IiknmSOA9UCtux909x7gYWDjoDobgW8G048B15uZufur7n4sKN8D5JhZ2l73uPVwC8fbutT9M4muqSohGtFRgMhQkkkA5UBdwuf6oGzIOu7eB7QBxYPq/Cdgu7sn/hT7etD985c2zMNwzewuM9tmZtuam6f2f+If7DhGTmaU5fPywg4lbeTNyGT94iKq61ppOdcTdjgiKWVS+iHMbCXxbqH3JxTf4e4NZpYHPA58DPjW4Lbu/gDwAMDatWun7J09Xb39PLXzOO9bUUZ2RjTscNLKtVWl/OZQC8/tbxp23KWPXrlgkqMSCV8yRwANQGXC54qgbMg6ZpYBFACngs8VwBPAx939wEADd28I3tuBh4h3NU1bW/acoK2zl1vXVo5cWcZVfk4m6xYVsf1oK8dOd4YdjkjKSCYBbAWqzGyxmWUBtwGbBtXZRPwkL8DNwDPu7mZWCDwF3OPuLw5UNrMMMysJpjOBDwO7x7Yqqe2RrXVUzM7hHRcN7hmTyfDe5WXkZGXwZHUDMT0+UgRIIgEEffp3A1uAfcCj7r7HzO41sxuDal8Fis2sFvhTYOBS0buBpcDnB13umQ1sMbOdQDXxI4j/GM8VSyVHTp3jVwdO8ZG1lUQiQ57qkAmWkxXlw6vmUd/aqSEiRAJJnQNw983A5kFln0+Y7gJuGaLdF4AvDLPYK5IPc2p7
dFsdEYOb11aEHUpaW11RwPajrTy9t5GV8wsoyMkMOySRUOlO4AnW1x/j/22r57qL5zCvICfscNKambFxTTkxd3644xiuriBJc0oAE+z5mmaa2rv5yDqd/E0FRTOzeO/yMvYeP8OLtSfDDkckVEoAE+zhrXWUzMrmPZekxUgXU8LVS0tYOT+fH+8+wf4T7WGHIxIajUcwTh56+ehbyprbu/n5vkY+9a6LyIwq16aKiBm3XFHJV84d4OGtR/mv110UdkgioVACmEA/29fIjMwon3zn4rBDkUGyMiL8/lUL+bdna/n2r4+QnRF9y0lh3Rwm051+lk6Q422d7Gpo4w+uXkSJHvuYkmbnZvH7Vy2kvbuPr7xwgJPtGjBO0osSwAT52d5GZmRGuOsadS+ksoXFM/nDdy6hty/Gv79wgPrWjrBDEpk0SgAToK6lg30n2nnn0lIKcnWteaorn53Dp951EdkZER785SF21J0OOySRSaEEMAGe3tdIblaUqzXsw5RRMiubT117EfPyZ/DItjoe315PR09f2GGJTCglgHG27/gZapvOct2yUrIzNernVJKfk8l/uWYJ1y0rZfuRVjb+64vUNukyUZm+lADGUU9fjB/uOMacvGyu0q//KSkaMd6/ci5/cPViWjt62PivL/LUzuNhhyUyIZQAxtEzrzVxurOXjWvKyYho005lS+fM4kd/fA0Xz83jjx7azt/+aC89fbGwwxIZV7oPYJw0nunil7XNvG3BbBaXzAw7HBkHcwtm8PBdb+e+p/by1V8e4pevn+SLN69m77Ezb6mrewZkKtLP1HEQizk/qG4gOyPKhkvnhh2OjKOsjAh/s/FSHvz4Wto6e/m9f3uRp3Yeo7u3P+zQRMZMRwDj4B9+up/Dpzr4vcvLmZWtTTodvXdFGeuXFPH3P3mN77x0lB31bbx3eRlXLJxNVM94kClKe6sxevyVev7tuQOsW1TEFQtnhx2OTKD8GZl84aZV5GVnsnn3cZ6sbuBXB07y3uVlxGKuh/3IlKMuoDHYeriFe76/k3dcVMyNl83HTDuAdFBZlMtd1yzhjisXEHPnod8c5YZ//gU/3HGM/pieMSBThxLABXqhpplPffsVKmfn8uU7rlA3QJoxM1bOL8IpTGgAAAo8SURBVOAz1y/j1rWV9Lvzx997lfX3/YzPPraTZ19r0o1kkvLUBTRKLed6+MKP9vL9VxtYUjqTr965TsM9pLFoxFhTWcjqigJeO36GnQ1tPFndwCPb6jBgYXEuy+fls6Aol6KZWcyemUVuVhTDMIP+mNPZ209Xbz8vHfjts4ozopF4/dwsPn3dErIzdFOhjL+kEoCZbQD+GYgCD7r73w2anw18i/hzfk8BH3H3w8G8zwGfBPqB/+7uW5JZZiqJxZxXjrby410neOLVeto6e3n3xaVcd/Ecfn3gFL8+oIeMp7uIGSvmF7BifgF9/TEOnjxHXWsHmZEIe4+f4eevNV3wfQT//PMaFpXMZNmcPJbOmcX8whzmFcxgTn42edmZzMyOMjM7g+yMiLohZVRGTABmFgW+BLwPqAe2mtkmd9+bUO2TQKu7LzWz24AvAh8xsxXAbcBKYD7wMzNbFrQZaZkTzt3p7Xe6+uK/wDq6+2nt6OF0Ry9N7V0caD5HbdNZdta3cfJsN1kZEa6tKmXFvHzmFsyYzFBlCsmIRlhWlseysrw37g9wdzp6+mk518Njr9QHZWAWv9Q0MxohI6EbsacvRmtHDy3nephbMIOaxnZqGtv56d4TDHeawYCMqJERiZCdGWFGRpQZmRGqyvIozMmkICeTWTMyyM2KkpOVQXY0QkbUiEbibSIW79qKRixYjpEVjZCVESE7I0pWRoSsaITMDHsj3oyB90i83eAE5O7EPH6kEwuewTzwHgnqRiNG1OJHRIPbx4J2MSd492BdB+pD1Ib+7vHg7vTHnH533ON/sze2t8XXIWIM+/0D8Se2d367/iO1n2jJHAGsB2rd/SCA
mT0MbAQSd9Ybgb8Oph8D/tXia7MReNjdu4FDZlYbLI8kljluPv3tV3i+phnnt3+Evlhs2P9IAzIiRsmsbCpm53D98jlcUpan8X3kgpgZM7MzmJmdQVn+yD8eZmRGyc/JZGHxzDfdZNbXH6P5bDff/NURznT20t0Xo7uvn56+GL39Tl8s/t7d209XX4yu3n521bfR2dtPZ08/Pf0TezezWTwRAThv3mEm237AhbSNBjtVgjjik/am5SbGl/hdb+wfiO+4B3baoxGxC1vvgbbRiL0puVkQrRn88I/fyUWls0a/4PNIJgGUA3UJn+uBK4er4+59ZtYGFAflLw1qWx5Mj7RMAMzsLuCu4ONZM9ufRMzjpeQAjOnJ4XeMMYAR2peQRHxjjWEMyx0xvvGI7QKXkcqxQUJ8E/X3G6Ok/u2FKJXju6DYlv7tmL5z4VCFKX8S2N0fAB4I47vNbJu7rw3ju5Oh+C5cKscGim+sUjm+VIotmctAG4DKhM8VQdmQdcwsAyggfjJ4uLbJLFNERCZQMglgK1BlZovNLIv4Sd1Ng+psAu4Mpm8GnnF3D8pvM7NsM1sMVAG/SXKZIiIygUbsAgr69O8GthC/ZPNr7r7HzO4Ftrn7JuCrwLeDk7wtxHfoBPUeJX5ytw/4I3fvBxhqmeO/emMWStfTKCi+C5fKsYHiG6tUji9lYjO/kNPVIiIy5WkoCBGRNKUEICKSppQAhmFmh81sl5lVm9m2FIjna2bWZGa7E8qKzOxpM3s9eA9lPOphYvtrM2sItl+1mX0wjNiCWCrN7Fkz22tme8zsM0F56NvvPLGlxPYzsxlm9hsz2xHE9zdB+WIze9nMas3skeBijlSK7xtmdihh+60JI76EOKNm9qqZ/Sj4nBLbTwng/N7t7mtS5JrdbwAbBpXdA/zc3auAnwefw/AN3hobwP3B9lvj7psnOaZEfcD/dPcVwFXAHwXDlKTC9hsuNkiN7dcNvMfdLwPWABvM7Criw73c7+5LgVbiw8GkUnwAf56w/apDim/AZ4B9CZ9TYvspAUwR7v4C8SusEm0EvhlMfxO4aVKDCgwTW8pw9+Puvj2Ybif+H7GcFNh+54ktJXjc2eBjZvBy4D3Eh32BcP/tDRdfyjCzCuBDwIPBZyNFtp8SwPAc+KmZvRIMR5GKytz9eDB9AigLM5gh3G1mO4MuopR4XJqZLQIuB14mxbbfoNggRbZf0H1RDTQBTwMHgNPuPvDAg8QhXkKPz90Htt99wfa73+IjFofln4D/BQwMxFRMimw/JYDhvdPd3wbcQPyw/NqwAzqf4Ma7VPrl82XgIuKH5ceBfww3HDCzWcDjwJ+4+5nEeWFvvyFiS5nt5+797r6G+B3764FLwoplKIPjM7NLgc8Rj3MdUAR8NozYzOzDQJO7vxLG949ECWAY7t4QvDcBT/DbUUxTSaOZzQMI3ptCjucN7t4Y/MeMAf9ByNvPzDKJ72C/6+7fD4pTYvsNFVuqbb8gptPAs8DbgcJg2BdIkaFcEuLbEHSteTAS8dcJb/tdDdxoZoeBh4l3/fwzKbL9lACGYGYzzSxvYBp4P7D7/K1CkTgEx53AD0KM5U0GdqyB3yXE7Rf0uX4V2Ofu/zdhVujbb7jYUmX7mVmpmRUG0znEn+Gxj/iO9uagWmj/9oaJ77WExG7E+9dD2X7u/jl3r3D3RcRHSHjG3e8gVbaf7gR+KzNbQvxXP8SHy3jI3e8LMSTM7HvAdcSHkm0E/gp4EngUWAAcAW5190k/GTtMbNcR775w4DDwqYT+9smO753AL4Bd/LYf9n8T72sPdfudJ7bbSYHtZ2ariZ+kjBL/wfiou98b/B95mHj3yqvA7we/tlMlvmeAUuLD/1cDn044WRwKM7sO+DN3/3DKbD8lABGR9KQuIBGRNKUEICKSppQARETSlBKAiEiaUgIQEUlTSgAik8zM1pjZr4PRK3ea2UfCjknSky4DFZkAZpaRMNbL4HnLiI8+8bqZ
zQdeAZYHd7KKTBodAYgAZvYXZlZjZr80s++Z2Z+Z2XNmtjaYXxLczj8w+Nj/MbOtwS/4TwXl15nZL8xsE7DXzO41sz9J+I77zOwz7l7j7q8DuPsx4kNQlE72OouM+FB4kenOzK4gfpv+GuL/J7YT/1U+nE8Cbe6+Lhhl8kUz+2kw723Ape5+KBjd8/vAP5lZJPiON41JY2brgSziI2yKTColABG4BnjC3TsAgl/w5/N+YLWZDYzlUgBUAT3Ab9z9EIC7HzazU2Z2OfGhpl9191MDCwnGq/k2cGcw6JvIpFICEBleH7/tJp2RUG7AH7v7lsTKwVgv5wYt40HgE8Bc4GsJdfOBp4C/cPeXxjVqkSTpHIAIvADcZGY5wSiwvxOUHwauCKZvTqi/BfivwTDOmNmyYNTYoTxB/HGZ64J2BM9/fQL4lrs/Nkw7kQmnIwBJe+6+3cweAXYQPyG7NZj1D8CjwRPhnkpo8iCwCNgeDDfczDCP9HP3HjN7lvgToPqD4luBa4FiM/tEUPaJFHhuraQZXQYqMoiZ/TVw1t3/YRyWFSF+UvmWgSt/RFKFuoBEJoiZrQBqgZ9r5y+pSEcAIiJpSkcAIiJpSglARCRNKQGIiKQpJQARkTSlBCAikqb+P1RFdtCiIlHhAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.distplot(train_df[\"query2\"].map(lambda x : len(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "40.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.percentile(train_df[\"query2\"].map(lambda x : len(x)),100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['剧烈运动后咯血,是怎么了?', '剧烈运动后咯血,是怎么了?', '剧烈运动后咯血,是怎么了?', ...,\n",
       "       '过敏性哮喘吃什么药管用', '过敏性哮喘吃什么药管用', '过敏性哮喘吃什么药管用'], dtype=object)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df[\"query1\"].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 正例负例还算均匀"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    5250\n",
       "1.0    3497\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df['label'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 不同问题类别分布比例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "感冒        1921\n",
       "上呼吸道感染    1597\n",
       "肺炎        1475\n",
       "肺气肿        980\n",
       "哮喘         875\n",
       "胸膜炎        795\n",
       "支原体肺炎      790\n",
       "咳血         320\n",
       "Name: category, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df[\"category\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>query1</th>\n",
       "      <th>query2</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>支原体肺炎</td>\n",
       "      <td>小儿支原体肺炎怎样能彻底治愈</td>\n",
       "      <td>小儿支原体肺炎如何根治？</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351</th>\n",
       "      <td>支原体肺炎</td>\n",
       "      <td>小儿支原体肺炎怎样能彻底治愈</td>\n",
       "      <td>小儿支原体肺炎怎样可以彻底治好？</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>352</th>\n",
       "      <td>支原体肺炎</td>\n",
       "      <td>小儿支原体肺炎怎样能彻底治愈</td>\n",
       "      <td>小孩支原体肺炎多久能痊愈？</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>353</th>\n",
       "      <td>支原体肺炎</td>\n",
       "      <td>小儿支原体肺炎怎样能彻底治愈</td>\n",
       "      <td>如何诊断小儿支原体肺炎？</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>354</th>\n",
       "      <td>支原体肺炎</td>\n",
       "      <td>小儿支原体肺炎怎样能彻底治愈</td>\n",
       "      <td>小儿支原体肺炎有哪些病症？</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8052</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>请问，怎么可以治疗哮喘病</td>\n",
       "      <td>哮喘病发吃什么有效？</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     category          query1            query2  label\n",
       "350     支原体肺炎  小儿支原体肺炎怎样能彻底治愈      小儿支原体肺炎如何根治？    NaN\n",
       "351     支原体肺炎  小儿支原体肺炎怎样能彻底治愈  小儿支原体肺炎怎样可以彻底治好？    NaN\n",
       "352     支原体肺炎  小儿支原体肺炎怎样能彻底治愈     小孩支原体肺炎多久能痊愈？    NaN\n",
       "353     支原体肺炎  小儿支原体肺炎怎样能彻底治愈      如何诊断小儿支原体肺炎？    NaN\n",
       "354     支原体肺炎  小儿支原体肺炎怎样能彻底治愈     小儿支原体肺炎有哪些病症？    NaN\n",
       "8052       哮喘    请问，怎么可以治疗哮喘病        哮喘病发吃什么有效？    NaN"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_nan_df = train_df.loc[(train_df[\"label\"]!=1) & (train_df[\"label\"]!=0),:]\n",
    "label_nan_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([350, 351, 352, 353, 354, 8052], dtype='int64')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_nan_df.index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 删除label为nan的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 8747 entries, 0 to 8752\n",
      "Data columns (total 4 columns):\n",
      " #   Column    Non-Null Count  Dtype \n",
      "---  ------    --------------  ----- \n",
      " 0   category  8747 non-null   object\n",
      " 1   query1    8747 non-null   object\n",
      " 2   query2    8747 non-null   object\n",
      " 3   label     8747 non-null   int64 \n",
      "dtypes: int64(1), object(3)\n",
      "memory usage: 341.7+ KB\n"
     ]
    }
   ],
   "source": [
    "train_df = train_df.drop(index=label_nan_df.index,axis=0)\n",
    "train_df[\"label\"] = train_df[\"label\"].map(lambda x :int(x))\n",
    "train_df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 切分dev一部分出来作为test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>query1</th>\n",
       "      <th>query2</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>咳血</td>\n",
       "      <td>请问呕血与咯血有什么区别？</td>\n",
       "      <td>请问呕血与咯血这两者之间有什么区别？</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>咳血</td>\n",
       "      <td>请问呕血与咯血有什么区别？</td>\n",
       "      <td>请问呕血与咯血异同？</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>咳血</td>\n",
       "      <td>请问呕血与咯血有什么区别？</td>\n",
       "      <td>请问呕血与咯血怎么治疗？</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>咳血</td>\n",
       "      <td>请问呕血与咯血有什么区别？</td>\n",
       "      <td>请问呕血与咯血是什么原因导致的？</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>咳血</td>\n",
       "      <td>请问呕血与咯血有什么区别？</td>\n",
       "      <td>请问呕血与咯血与其他疾病有关联吗？</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1997</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>变应性哮喘就是过敏性哮喘吗？</td>\n",
       "      <td>变应性哮喘与过敏性哮喘一样吗？</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1998</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>变应性哮喘就是过敏性哮喘吗？</td>\n",
       "      <td>变应性哮喘是否就是过敏性哮喘？</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1999</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>变应性哮喘就是过敏性哮喘吗？</td>\n",
       "      <td>变应性哮喘的饮食禁忌有哪些？</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>变应性哮喘就是过敏性哮喘吗？</td>\n",
       "      <td>变应性哮喘怎么治疗？</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2001</th>\n",
       "      <td>哮喘</td>\n",
       "      <td>变应性哮喘就是过敏性哮喘吗？</td>\n",
       "      <td>变应性哮喘能跑步吗？</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2002 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     category          query1              query2  label\n",
       "0          咳血   请问呕血与咯血有什么区别？  请问呕血与咯血这两者之间有什么区别？      1\n",
       "1          咳血   请问呕血与咯血有什么区别？          请问呕血与咯血异同？      1\n",
       "2          咳血   请问呕血与咯血有什么区别？        请问呕血与咯血怎么治疗？      0\n",
       "3          咳血   请问呕血与咯血有什么区别？    请问呕血与咯血是什么原因导致的？      0\n",
       "4          咳血   请问呕血与咯血有什么区别？   请问呕血与咯血与其他疾病有关联吗？      0\n",
       "...       ...             ...                 ...    ...\n",
       "1997       哮喘  变应性哮喘就是过敏性哮喘吗？     变应性哮喘与过敏性哮喘一样吗？      1\n",
       "1998       哮喘  变应性哮喘就是过敏性哮喘吗？     变应性哮喘是否就是过敏性哮喘？      1\n",
       "1999       哮喘  变应性哮喘就是过敏性哮喘吗？      变应性哮喘的饮食禁忌有哪些？      0\n",
       "2000       哮喘  变应性哮喘就是过敏性哮喘吗？          变应性哮喘怎么治疗？      0\n",
       "2001       哮喘  变应性哮喘就是过敏性哮喘吗？          变应性哮喘能跑步吗？      0\n",
       "\n",
       "[2002 rows x 4 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dev_df = pd.read_csv(\"origin-data/dev.csv\")\n",
    "dev_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2002 entries, 0 to 2001\n",
      "Data columns (total 4 columns):\n",
      " #   Column    Non-Null Count  Dtype \n",
      "---  ------    --------------  ----- \n",
      " 0   category  2002 non-null   object\n",
      " 1   query1    2002 non-null   object\n",
      " 2   query2    2002 non-null   object\n",
      " 3   label     2002 non-null   int64 \n",
      "dtypes: int64(1), object(3)\n",
      "memory usage: 62.7+ KB\n"
     ]
    }
   ],
   "source": [
    "dev_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "上呼吸道感染    425\n",
       "感冒        379\n",
       "肺炎        323\n",
       "哮喘        230\n",
       "胸膜炎       225\n",
       "肺气肿       195\n",
       "支原体肺炎     165\n",
       "咳血         60\n",
       "Name: category, dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dev_df[\"category\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "dev_index,test_index,_,_ = train_test_split(dev_df.index,dev_df[\"label\"],test_size=0.3,random_state=2020,stratify=dev_df[\"label\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1401 entries, 1796 to 395\n",
      "Data columns (total 4 columns):\n",
      " #   Column    Non-Null Count  Dtype \n",
      "---  ------    --------------  ----- \n",
      " 0   category  1401 non-null   object\n",
      " 1   query1    1401 non-null   object\n",
      " 2   query2    1401 non-null   object\n",
      " 3   label     1401 non-null   int64 \n",
      "dtypes: int64(1), object(3)\n",
      "memory usage: 54.7+ KB\n"
     ]
    }
   ],
   "source": [
    "new_dev_df = dev_df.iloc[dev_index,:]\n",
    "new_dev_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 601 entries, 1482 to 742\n",
      "Data columns (total 4 columns):\n",
      " #   Column    Non-Null Count  Dtype \n",
      "---  ------    --------------  ----- \n",
      " 0   category  601 non-null    object\n",
      " 1   query1    601 non-null    object\n",
      " 2   query2    601 non-null    object\n",
      " 3   label     601 non-null    int64 \n",
      "dtypes: int64(1), object(3)\n",
      "memory usage: 23.5+ KB\n"
     ]
    }
   ],
   "source": [
    "test_df = dev_df.iloc[test_index,:]\n",
    "test_df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 重新存入切分后的数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 标点符号转换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def replace_punc(query):\n",
    "    query = re.sub(',','，',query)\n",
    "    query = re.sub('\\?','？',query)\n",
    "    query = re.sub('\\.','。',query)\n",
    "    query = re.sub('!','！',query)\n",
    "    return query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/envs/tf2_py37/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  after removing the cwd from sys.path.\n",
      "/opt/anaconda3/envs/tf2_py37/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"\n",
      "/opt/anaconda3/envs/tf2_py37/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  import sys\n",
      "/opt/anaconda3/envs/tf2_py37/lib/python3.7/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "train_df['query1'] = train_df.loc[:,\"query1\"].map(replace_punc)\n",
    "train_df['query2'] = train_df.loc[:,\"query2\"].map(replace_punc)\n",
    "\n",
    "new_dev_df['query1'] = new_dev_df.loc[:,\"query1\"].map(replace_punc)\n",
    "new_dev_df['query2'] = new_dev_df.loc[:,\"query2\"].map(replace_punc)\n",
    "\n",
    "test_df['query1'] = test_df.loc[:,\"query1\"].map(replace_punc)\n",
    "test_df['query2'] = test_df.loc[:,\"query2\"].map(replace_punc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 重新存储"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_csv(\"shuffle-data/train_data.csv\",index=False)\n",
    "new_dev_df.to_csv(\"shuffle-data/dev_data.csv\",index=False)\n",
    "test_df.to_csv(\"shuffle-data/test_data.csv\",index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tf2_py37",
   "language": "python",
   "name": "tf2_py37"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc-autonumbering": true
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
